library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the raw loan-default extract and lower-case the column names so the
# rest of the script can refer to them consistently.
df <- read.csv("/Users/ankitkumar/Desktop/Loan_Default.csv")
colnames(df) <- tolower(colnames(df))
# Compact overview of column types and leading values.
glimpse(df)
## Rows: 148,670
## Columns: 34
## $ id <int> 24890, 24891, 24892, 24893, 24894, 24895, 24…
## $ year <int> 2019, 2019, 2019, 2019, 2019, 2019, 2019, 20…
## $ loan_limit <chr> "cf", "cf", "cf", "cf", "cf", "cf", "cf", ""…
## $ gender <chr> "Sex Not Available", "Male", "Male", "Male",…
## $ approv_in_adv <chr> "nopre", "nopre", "pre", "nopre", "pre", "pr…
## $ loan_type <chr> "type1", "type2", "type1", "type1", "type1",…
## $ loan_purpose <chr> "p1", "p1", "p1", "p4", "p1", "p1", "p3", "p…
## $ credit_worthiness <chr> "l1", "l1", "l1", "l1", "l1", "l1", "l1", "l…
## $ open_credit <chr> "nopc", "nopc", "nopc", "nopc", "nopc", "nop…
## $ business_or_commercial <chr> "nob/c", "b/c", "nob/c", "nob/c", "nob/c", "…
## $ loan_amount <int> 116500, 206500, 406500, 456500, 696500, 7065…
## $ rate_of_interest <dbl> NA, NA, 4.560, 4.250, 4.000, 3.990, 4.500, 4…
## $ interest_rate_spread <dbl> NA, NA, 0.2000, 0.6810, 0.3042, 0.1523, 0.99…
## $ upfront_charges <dbl> NA, NA, 595.00, NA, 0.00, 370.00, 5120.00, 5…
## $ term <dbl> 360, 360, 360, 360, 360, 360, 360, 360, 360,…
## $ neg_ammortization <chr> "not_neg", "not_neg", "neg_amm", "not_neg", …
## $ interest_only <chr> "not_int", "not_int", "not_int", "not_int", …
## $ lump_sum_payment <chr> "not_lpsm", "lpsm", "not_lpsm", "not_lpsm", …
## $ property_value <dbl> 118000, NA, 508000, 658000, 758000, 1008000,…
## $ construction_type <chr> "sb", "sb", "sb", "sb", "sb", "sb", "sb", "s…
## $ occupancy_type <chr> "pr", "pr", "pr", "pr", "pr", "pr", "pr", "p…
## $ secured_by <chr> "home", "home", "home", "home", "home", "hom…
## $ total_units <chr> "1U", "1U", "1U", "1U", "1U", "1U", "1U", "1…
## $ income <dbl> 1740, 4980, 9480, 11880, 10440, 10080, 5040,…
## $ credit_type <chr> "EXP", "EQUI", "EXP", "EXP", "CRIF", "EXP", …
## $ credit_score <int> 758, 552, 834, 587, 602, 864, 860, 863, 580,…
## $ co.applicant_credit_type <chr> "CIB", "EXP", "CIB", "CIB", "EXP", "EXP", "E…
## $ age <chr> "25-34", "55-64", "35-44", "45-54", "25-34",…
## $ submission_of_application <chr> "to_inst", "to_inst", "to_inst", "not_inst",…
## $ ltv <dbl> 98.72881, NA, 80.01969, 69.37690, 91.88654, …
## $ region <chr> "south", "North", "south", "North", "North",…
## $ security_type <chr> "direct", "direct", "direct", "direct", "dir…
## $ status <int> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,…
## $ dtir1 <dbl> 45, NA, 46, 42, 39, 40, 44, 42, 44, 30, 44, …
# Print the (lower-cased) column names of the data frame.
names(df)
## [1] "id" "year"
## [3] "loan_limit" "gender"
## [5] "approv_in_adv" "loan_type"
## [7] "loan_purpose" "credit_worthiness"
## [9] "open_credit" "business_or_commercial"
## [11] "loan_amount" "rate_of_interest"
## [13] "interest_rate_spread" "upfront_charges"
## [15] "term" "neg_ammortization"
## [17] "interest_only" "lump_sum_payment"
## [19] "property_value" "construction_type"
## [21] "occupancy_type" "secured_by"
## [23] "total_units" "income"
## [25] "credit_type" "credit_score"
## [27] "co.applicant_credit_type" "age"
## [29] "submission_of_application" "ltv"
## [31] "region" "security_type"
## [33] "status" "dtir1"
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2023 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
# Empty strings in the text columns stand for missing values: recode them as
# NA, then draw the missingness map of the whole data frame.
df <- df %>%
  mutate(across(where(is.character), ~ na_if(.x, "")))
missmap(df)
# Positions of the character and numeric columns. They are derived from the
# data frame itself, so nothing depends on a hard-coded column count, and
# `unname()` keeps them plain integer vectors as before.
char_col <- unname(which(vapply(df, is.character, logical(1))))
num_col <- unname(which(vapply(df, is.numeric, logical(1))))
length(num_col)
## [1] 13
length(char_col)
## [1] 21
sort(char_col)
## [1] 3 4 5 6 7 8 9 10 16 17 18 20 21 22 23 25 27 28 29 31 32
sort(num_col)
## [1] 1 2 11 12 13 14 15 19 24 26 30 33 34
# Names of the character and numeric columns, read from the data instead of
# being typed by hand. The hand-written lists had drifted from the data:
# `property_value` (numeric) was listed as character, `lump_sum_payment`
# (character) as numeric, `credit_type` was missing and `region` appeared twice.
charcol <- names(df)[vapply(df, is.character, logical(1))]
length(charcol)
## [1] 21
numcol <- names(df)[vapply(df, is.numeric, logical(1))]
length(numcol)
## [1] 13
# Show the distinct values (NA included) of every character column, one
# column per printed vector.
for (column_values in df[char_col]) {
  print(unique(column_values))
}
## [1] "cf" NA "ncf"
## [1] "Sex Not Available" "Male" "Joint"
## [4] "Female"
## [1] "nopre" "pre" NA
## [1] "type1" "type2" "type3"
## [1] "p1" "p4" "p3" "p2" NA
## [1] "l1" "l2"
## [1] "nopc" "opc"
## [1] "nob/c" "b/c"
## [1] "not_neg" "neg_amm" NA
## [1] "not_int" "int_only"
## [1] "not_lpsm" "lpsm"
## [1] "sb" "mh"
## [1] "pr" "sr" "ir"
## [1] "home" "land"
## [1] "1U" "2U" "3U" "4U"
## [1] "EXP" "EQUI" "CRIF" "CIB"
## [1] "CIB" "EXP"
## [1] "25-34" "55-64" "35-44" "45-54" "65-74" ">74" "<25" NA
## [1] "to_inst" "not_inst" NA
## [1] "south" "North" "central" "North-East"
## [1] "direct" "Indriect"
library(patchwork)
# Turn every character column into a factor; all other columns are untouched.
df <- df %>%
  mutate(across(where(is.character), as.factor))
# Bar chart of the level counts of one column of `data`.
#
# `index` is the column position. The column is mapped through the `.data`
# pronoun so the x axis carries the column name instead of an expression such
# as `df[, 3]`, and the plot no longer reaches around its own data argument.
plot_level_counts <- function(data, index) {
  column <- names(data)[[index]]
  ggplot(data, aes(x = .data[[column]])) +
    geom_bar(fill = "steelblue") +
    labs(x = column) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
}

# One chart per categorical column; the object names keep the column position.
gg3 <- plot_level_counts(df, 3)
gg4 <- plot_level_counts(df, 4)
gg5 <- plot_level_counts(df, 5)
gg6 <- plot_level_counts(df, 6)
gg7 <- plot_level_counts(df, 7)
gg8 <- plot_level_counts(df, 8)
gg9 <- plot_level_counts(df, 9)
gg10 <- plot_level_counts(df, 10)
gg16 <- plot_level_counts(df, 16)
gg17 <- plot_level_counts(df, 17)
gg18 <- plot_level_counts(df, 18)
gg20 <- plot_level_counts(df, 20)
gg21 <- plot_level_counts(df, 21)
gg22 <- plot_level_counts(df, 22)
gg23 <- plot_level_counts(df, 23)
gg25 <- plot_level_counts(df, 25)
gg27 <- plot_level_counts(df, 27)
gg28 <- plot_level_counts(df, 28)
gg29 <- plot_level_counts(df, 29)
gg31 <- plot_level_counts(df, 31)
gg32 <- plot_level_counts(df, 32)

# Lay the charts out with patchwork: `|` places side by side, `/` stacks rows.
(gg3 | gg4 | gg5 | gg6) / (gg7 | gg8 | gg9 | gg10)
(gg16 | gg17 | gg18 | gg20) / (gg21 | gg22 | gg23 | gg25)
(gg27 | gg28 | gg29) / (gg31 | gg32)
# Number of missing values per column, keeping only columns that have any,
# sorted from most to least missing (ties broken alphabetically by name).
#
# The NAs are counted column by column before reshaping, so columns of
# different types never have to be stacked into one value column. This avoids
# the attribute-dropping warning that gather() produced and leaves the result
# as a plain, ungrouped tibble.
missing.values <- df %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  as_tibble() %>%
  pivot_longer(everything(), names_to = "key", values_to = "num.missing") %>%
  filter(num.missing > 0) %>%
  arrange(desc(num.missing), key)
missing.values
## # A tibble: 14 × 2
## key num.missing
## <chr> <int>
## 1 upfront_charges 39642
## 2 interest_rate_spread 36639
## 3 rate_of_interest 36439
## 4 dtir1 24121
## 5 ltv 15098
## 6 property_value 15098
## 7 income 9150
## 8 loan_limit 3344
## 9 approv_in_adv 908
## 10 age 200
## 11 submission_of_application 200
## 12 loan_purpose 134
## 13 neg_ammortization 121
## 14 term 41
#
# Column positions in `df` of the variables that contain missing values, in
# the same order as the rows of `missing.values`. match() replaces the nested
# loops, so the code no longer assumes exactly 14 incomplete columns or 34
# columns in total.
z <- match(missing.values[[1]], colnames(df))
z %>% sort()
## [1] 3 5 7 12 13 14 15 16 19 24 28 29 30 34
# Work on a copy so the un-imputed data stays available in `df`.
df1 <- df
library(imputeTS)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo

# Numeric columns that contain missing values, with the marker sizes used in
# their diagnostic plots. Most columns share the same sizes; `term` uses larger
# imputation markers and `ltv` larger known-value markers.
imputation_plan <- data.frame(
  column = c(
    "rate_of_interest", "interest_rate_spread", "upfront_charges", "term",
    "property_value", "income", "ltv", "dtir1"
  ),
  size_points = c(1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 2.5, 1.5),
  size_imputations = c(0.5, 0.5, 0.5, 2.5, 0.5, 0.5, 0.5, 0.5),
  stringsAsFactors = FALSE
)

# Impute each column with na_kalman() and plot known against imputed values.
#
# Each step used to end with `mutate(.data = df1, options(max.print = 1))`.
# That call built and printed a throwaway data frame, and its only lasting
# effect was to set the global print limit to 1, truncating every later
# printout. It has been removed.
#
# In the recorded run na_kalman() reported a StructTS "possible convergence
# problem" warning for every column except `ltv` and `dtir1`.
# NOTE(review): na_kalman() is a time-series method and the plots label the
# x axis "Time"; the rows do not appear to be time-ordered - confirm that this
# imputation is appropriate for the data.
for (step in seq_len(nrow(imputation_plan))) {
  column <- imputation_plan$column[step]

  # explicit print(): values do not auto-print inside a loop
  print(head(df1[[column]]))

  x_with_na <- df1[[column]]
  x_with_imputations <- na_kalman(x_with_na)
  df1[[column]] <- x_with_imputations

  print(
    ggplot_na_imputations(
      x_with_na,
      x_with_imputations,
      x_with_truth = NULL,
      x_axis_labels = NULL,
      title = "Imputed Values",
      subtitle = "Visualization of missing value replacements",
      xlab = "Time",
      ylab = "Value",
      color_points = "steelblue",
      color_imputations = "indianred",
      color_truth = "seagreen3",
      color_lines = "lightslategray",
      shape_points = 16,
      shape_imputations = 9,
      shape_truth = 16,
      size_points = imputation_plan$size_points[step],
      size_imputations = imputation_plan$size_imputations[step],
      # no ground truth is plotted; kept equal to size_points as before
      size_truth = imputation_plan$size_points[step],
      size_lines = 0.5,
      linetype = "solid",
      connect_na = TRUE,
      legend = TRUE,
      legend_size = 5,
      label_known = "known values",
      label_imputations = "imputed values",
      label_truth = "ground truth",
      theme = ggplot2::theme_linedraw()
    )
  )
}
# Set the console print limit to 1000 entries.
options(max.print = 1000)
# Treat the outcome as categorical.
df1[["status"]] <- as.factor(df1[["status"]])
class(df1[["status"]])
## [1] "factor"
# Keep only the rows that have no missing value in any column.
df <- na.omit(df1)
# Per-column summary of the cleaned data.
df %>% summary()
## id year loan_limit gender
## Min. : 24890 Min. :2019 cf :134224 Female :26405
## 1st Qu.: 62036 1st Qu.:2019 ncf: 9759 Joint :40200
## Median : 99169 Median :2019 Male :40995
## Mean : 99210 Mean :2019 Sex Not Available:36383
## 3rd Qu.:136384 3rd Qu.:2019
## Max. :173559 Max. :2019
##
## approv_in_adv loan_type loan_purpose credit_worthiness open_credit
## nopre:121038 type1:109694 p1:33184 l1:137671 nopc:143444
## pre : 22945 type2: 19915 p2: 3150 l2: 6312 opc : 539
## type3: 14374 p3:54102
## p4:53547
##
##
##
## business_or_commercial loan_amount rate_of_interest interest_rate_spread
## b/c : 19915 Min. : 16500 Min. :0.000 Min. :-3.6380
## nob/c:124068 1st Qu.: 196500 1st Qu.:3.750 1st Qu.: 0.1781
## Median : 296500 Median :4.034 Median : 0.4374
## Mean : 331772 Mean :4.041 Mean : 0.4390
## 3rd Qu.: 436500 3rd Qu.:4.250 3rd Qu.: 0.6171
## Max. :3576500 Max. :8.000 Max. : 3.3570
##
## upfront_charges term neg_ammortization interest_only
## Min. : 0 Min. : 96.0 neg_amm: 14386 int_only: 6826
## 1st Qu.: 1250 1st Qu.:360.0 not_neg:129597 not_int :137157
## Median : 3163 Median :360.0
## Mean : 3228 Mean :335.1
## 3rd Qu.: 3901 3rd Qu.:360.0
## Max. :60000 Max. :360.0
##
## lump_sum_payment property_value construction_type occupancy_type
## lpsm : 3384 Min. : 8000 mh: 33 ir: 7053
## not_lpsm:140599 1st Qu.: 288000 sb:143950 pr:133903
## Median : 458000 sr: 3027
## Mean : 499248
## 3rd Qu.: 598000
## Max. :16508000
##
## secured_by total_units income credit_type credit_score
## home:143950 1U:141877 Min. : 0 CIB :46738 Min. :500.0
## land: 33 2U: 1432 1st Qu.: 3840 CRIF:42560 1st Qu.:599.0
## 3U: 371 Median : 6000 EQUI:14609 Median :699.0
## 4U: 303 Mean : 6962 EXP :40076 Mean :699.7
## 3rd Qu.: 8280 3rd Qu.:800.0
## Max. :578580 Max. :900.0
##
## co.applicant_credit_type age submission_of_application
## CIB:72056 <25 : 1295 not_inst:51069
## EXP:71927 >74 : 6987 to_inst :92914
## 25-34:18494
## 35-44:31816
## 45-54:33688
## 55-64:31550
## 65-74:20153
## ltv region security_type status
## Min. : 0.967 central : 8408 direct :143950 0:108713
## 1st Qu.: 63.142 North :72427 Indriect: 33 1: 35270
## Median : 73.534 North-East: 1207
## Mean : 72.701 south :61941
## 3rd Qu.: 84.574
## Max. :7831.250
##
## dtir1
## Min. : 5.00
## 1st Qu.:33.00
## Median :37.94
## Mean :37.72
## 3rd Qu.:44.00
## Max. :61.00
##
library(corrplot)
## corrplot 0.92 loaded
# Correlation matrix of the numeric variables.
# `year` is constant in this data (2019 throughout), which gave cor() a zero
# standard deviation and left an empty row/column in the plot; `id` is a row
# identifier. Both are left out.
loan_cor <- df %>%
  select(where(is.numeric), -any_of(c("id", "year"))) %>%
  drop_na() %>%
  cor()
corrplot(loan_cor, method = "circle", addCoef.col = 1, number.cex = 0.7)

# Same plot restricted to the main monetary and rate variables.
loan_cor <- df %>%
  select(
    loan_amount, property_value, income,
    interest_rate_spread, rate_of_interest, ltv
  ) %>%
  drop_na() %>%
  cor()
corrplot(loan_cor, method = "circle", addCoef.col = 1, number.cex = 0.7)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Pairwise plots of the main features, coloured by loan status.
# `status` travels with the plotted data and is referenced by name, so the
# colour mapping always matches the rows being drawn; `columns` limits the
# matrix to the six features.
pair_cols <- c(
  "loan_amount", "income", "age", "property_value",
  "rate_of_interest", "interest_rate_spread"
)
df %>%
  select(all_of(pair_cols), status) %>%
  ggpairs(columns = pair_cols, mapping = aes(color = status, alpha = 0.5)) +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
library("gridExtra")
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Jittered scatter plot of column `y` against column `x` of `data`.
#
# The axis labels are taken from the mapped column names, so a panel can no
# longer show one pair of variables while being labelled as another. In the
# previous version every panel plotted income/loan_amount or age/credit_score
# regardless of its labels.
scatter_pair <- function(data, x, y) {
  ggplot(data, aes(x = .data[[x]], y = .data[[y]])) +
    geom_point(alpha = 0.3, position = "jitter", color = "darkblue") +
    labs(x = x, y = y) +
    theme_bw() +
    theme(plot.background = element_rect(fill = "white"))
}

# As before, the first panel of every grid leaves out the extreme incomes.
df_income_capped <- df %>% filter(income < 500000)

# The variable pairs below follow the axis labels of the original panels.
# NOTE(review): the third panel of grid 1 was labelled age/income but drew
# credit_score on the y axis; the label is taken as the intent - confirm.

# Grid 1: loan amount and income
p1 <- scatter_pair(df_income_capped, "income", "loan_amount")
p2 <- scatter_pair(df, "age", "loan_amount")
p3 <- scatter_pair(df, "age", "income")
p4 <- scatter_pair(df, "income", "loan_amount")
grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)

# Grid 2: age against value and rates, property value against loan amount
p1 <- scatter_pair(df_income_capped, "age", "property_value")
p2 <- scatter_pair(df, "age", "rate_of_interest")
p3 <- scatter_pair(df, "property_value", "loan_amount")
p4 <- scatter_pair(df, "age", "interest_rate_spread")
grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)

# Grid 3: property value against age and income
p1 <- scatter_pair(df_income_capped, "age", "property_value")
p2 <- scatter_pair(df, "income", "property_value")
p3 <- scatter_pair(df, "age", "rate_of_interest")
p4 <- scatter_pair(df, "age", "property_value")
grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)

# Grid 4: loan amount against property value and rate of interest
p1 <- scatter_pair(df_income_capped, "age", "interest_rate_spread")
p2 <- scatter_pair(df, "loan_amount", "property_value")
p3 <- scatter_pair(df, "loan_amount", "rate_of_interest")
p4 <- scatter_pair(df, "age", "property_value")
grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)

# Grid 5: rate of interest against the other features
p1 <- scatter_pair(df_income_capped, "income", "rate_of_interest")
p2 <- scatter_pair(df, "age", "rate_of_interest")
p3 <- scatter_pair(df, "loan_amount", "interest_rate_spread")
p4 <- scatter_pair(df, "property_value", "rate_of_interest")
grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)

# Grid 6: interest rate spread against the other features
p1 <- scatter_pair(df_income_capped, "income", "interest_rate_spread")
p2 <- scatter_pair(df, "age", "interest_rate_spread")
p3 <- scatter_pair(df, "rate_of_interest", "interest_rate_spread")
p4 <- scatter_pair(df, "property_value", "interest_rate_spread")
grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)
library(DataExplorer)
# Bar plots of the discrete columns, broken down by loan status and laid out
# four to a page in a single column.
plot_bar(
  data = df,
  by = "status",
  ggtheme = theme_bw(),
  nrow = 4,
  ncol = 1,
  parallel = TRUE
)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Interactive scatter of interest rate against loan amount, coloured by loan
# status, with one animation frame per age band. Hovering shows the status.
plot_ly(
  data = df,
  x = ~loan_amount,
  y = ~rate_of_interest,
  color = ~status,
  colors = "Set2",
  frame = ~age,
  text = ~status,
  hoverinfo = "text",
  type = "scatter",
  mode = "markers"
) %>%
  plotly::layout(
    xaxis = list(title = "Loan Amount"),
    yaxis = list(title = "Loan Interest Rate")
  )
plot_ly() %>%
add_trace(data = df,
y = ~df$rate_of_interest,
color = ~df$age,
colors = c("#f5ed04","#de1b1d"),
type = "box") %>%
plotly::layout(xaxis = list(title = "age"),
yaxis = list(title = "Loan Interest Rate"))
plot_ly() %>%
add_trace(data = df,
y = ~df$rate_of_interest,
color = ~df$status,
colors = "Dark2",
type = "box") %>%
plotly::layout(xaxis = list(title = "Historical Default"),
yaxis = list(title = "Loan Interest Rate"))
# Box plots ---------------------------------------------------------------

# Overview: every column of `df` drawn on one shared axis
boxplot(df)

# One horizontal box plot per numeric feature, titled with the column name.
# The statistics returned by boxplot() are kept in a list named by feature.
# The earlier copy-pasted version stored them in p1..p10, which overwrote the
# ggplot objects p1..p4, and several titles carried trailing spaces.
numeric_features <- c(
  "loan_amount", "rate_of_interest", "interest_rate_spread",
  "upfront_charges", "term", "property_value", "income",
  "credit_score", "ltv", "dtir1"
)
boxplot_stats <- lapply(
  setNames(numeric_features, numeric_features),
  function(feature) {
    boxplot(df[[feature]], horizontal = TRUE, main = feature)
  }
)
# IQR-based outlier removal -----------------------------------------------
data <- df
dim(data)
## [1] 143983 34

# Drop every row that lies outside [Q1 - k * IQR, Q3 + k * IQR] in at least
# one numeric column of `data`.
#
# data: a data frame.
# k:    fence multiplier (1.5 is the conventional box-plot rule).
# Returns `data` restricted to the rows that pass every fence.
#
# Differences from the previous inline loop, which returned 0 rows:
#   * all numeric columns are checked, not only the hard-coded range 1:32;
#   * fences are computed once on the unfiltered data, so the result does
#     not depend on column order;
#   * columns with zero IQR (e.g. a constant column) are skipped, because
#     the strict `<` / `>` comparison against identical fences rejected
#     every row;
#   * fences are inclusive, and rows with a missing value in a column are
#     not treated as outliers for that column.
remove_iqr_outliers <- function(data, k = 1.5) {
  numeric_cols <- names(data)[vapply(data, is.numeric, logical(1))]
  keep <- rep(TRUE, nrow(data))
  for (col in numeric_cols) {
    values <- data[[col]]
    quartiles <- quantile(
      values,
      probs = c(0.25, 0.75),
      na.rm = TRUE,
      names = FALSE
    )
    spread <- quartiles[2] - quartiles[1]
    if (is.na(spread) || spread == 0) {
      next
    }
    lower <- quartiles[1] - k * spread
    upper <- quartiles[2] + k * spread
    keep <- keep & (is.na(values) | (values >= lower & values <= upper))
  }
  data[keep, , drop = FALSE]
}

data_no_outlier <- remove_iqr_outliers(data)
data <- data_no_outlier
# The previously recorded output here was `[1] 0 34` (every row removed);
# re-knit to refresh the printed dimensions.
dim(data_no_outlier)
library(rsample)
##
## Attaching package: 'rsample'
## The following object is masked from 'package:Rcpp':
##
## populate
library(dplyr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
# Train / test split ------------------------------------------------------
set.seed(222)

# Drop the identifier columns by name instead of by position. The positional
# form (`df[, -2]` then `df[, -1]`) depended on column order and removed two
# more columns every time the chunk was re-run.
df <- df[, !(names(df) %in% c("id", "year")), drop = FALSE]

# Split into 75% training / 25% testing rows, stratified on `status`
df_split <- initial_split(df, prop = 0.75, strata = status)
train_set <- training(df_split)
test_set <- testing(df_split)

# Summary table with the number of rows in each partition
table_split <- data.frame(
  Dataset = c("Training Set", "Testing Set"),
  Count = c(nrow(train_set), nrow(test_set))
)
kable(table_split) %>%
  kable_styling(bootstrap_options = "bordered", full_width = FALSE)
| Dataset | Count |
|---|---|
| Training Set | 107986 |
| Testing Set | 35997 |
# Logistic regression of `status` on every other column of the training set.
# NOTE(review): the recorded warnings and the summary below (coefficients
# "not defined because of singularities", estimates of magnitude 1e12) suggest
# collinear predictors and/or separation -- review before interpreting the fit.
logistics_classifier <- glm(
  formula = status ~ .,
  family = binomial,
  data = train_set
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logistics_classifier)
##
## Call:
## glm(formula = status ~ ., family = binomial, data = train_set)
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.056e+12 4.433e+12 0.689 0.490633
## loan_limitncf 7.173e-01 3.561e-02 20.145 < 2e-16 ***
## genderJoint -9.000e-03 3.922e-02 -0.229 0.818486
## genderMale 9.968e-02 2.783e-02 3.582 0.000341 ***
## genderSex Not Available 1.146e-02 3.553e-02 0.323 0.746975
## approv_in_advpre -2.380e-01 2.765e-02 -8.607 < 2e-16 ***
## loan_typetype2 7.704e-01 3.184e-02 24.193 < 2e-16 ***
## loan_typetype3 -7.120e-01 3.839e-02 -18.544 < 2e-16 ***
## loan_purposep2 9.600e-01 6.400e-02 14.999 < 2e-16 ***
## loan_purposep3 3.250e-01 2.955e-02 10.997 < 2e-16 ***
## loan_purposep4 1.303e-01 2.871e-02 4.540 5.63e-06 ***
## credit_worthinessl2 4.168e-01 4.412e-02 9.448 < 2e-16 ***
## open_creditopc -2.800e-01 1.870e-01 -1.497 0.134350
## business_or_commercialnob/c NA NA NA NA
## loan_amount 7.739e-08 1.114e-07 0.695 0.487238
## rate_of_interest 7.059e-02 2.952e-02 2.392 0.016778 *
## interest_rate_spread -9.329e-01 3.471e-02 -26.876 < 2e-16 ***
## upfront_charges -5.011e-05 3.893e-06 -12.872 < 2e-16 ***
## term -1.388e-03 1.917e-04 -7.242 4.43e-13 ***
## neg_ammortizationnot_neg -9.234e-01 2.710e-02 -34.070 < 2e-16 ***
## interest_onlynot_int -3.670e-01 4.342e-02 -8.453 < 2e-16 ***
## lump_sum_paymentnot_lpsm -2.599e+00 5.651e-02 -45.985 < 2e-16 ***
## property_value 4.627e-07 5.452e-08 8.487 < 2e-16 ***
## construction_typesb -3.056e+12 4.433e+12 -0.689 0.490633
## occupancy_typepr -1.041e+00 4.491e-02 -23.174 < 2e-16 ***
## occupancy_typesr -4.933e-01 7.607e-02 -6.484 8.90e-11 ***
## secured_byland -3.056e+12 4.433e+12 -0.689 0.490633
## total_units2U 8.218e-01 8.169e-02 10.060 < 2e-16 ***
## total_units3U 1.120e+00 1.535e-01 7.298 2.93e-13 ***
## total_units4U 4.294e-01 1.889e-01 2.273 0.023003 *
## income -5.234e-05 3.171e-06 -16.508 < 2e-16 ***
## credit_typeCRIF 4.764e-02 2.249e-02 2.118 0.034140 *
## credit_typeEQUI 1.256e+01 1.289e+00 9.744 < 2e-16 ***
## credit_typeEXP -2.498e-02 2.310e-02 -1.082 0.279429
## credit_score 1.305e-04 8.056e-05 1.620 0.105267
## co.applicant_credit_typeEXP -2.869e-01 2.844e-02 -10.087 < 2e-16 ***
## age>74 -5.844e-02 1.029e-01 -0.568 0.570160
## age25-34 -4.057e-01 9.691e-02 -4.186 2.84e-05 ***
## age35-44 -3.892e-01 9.588e-02 -4.059 4.92e-05 ***
## age45-54 -2.423e-01 9.590e-02 -2.527 0.011506 *
## age55-64 -1.450e-01 9.610e-02 -1.509 0.131218
## age65-74 -1.851e-01 9.753e-02 -1.898 0.057679 .
## submission_of_applicationto_inst 1.027e+00 2.786e-02 36.854 < 2e-16 ***
## ltv 1.932e-02 8.445e-04 22.882 < 2e-16 ***
## regionNorth -2.756e-01 3.897e-02 -7.072 1.53e-12 ***
## regionNorth-East 3.359e-02 9.768e-02 0.344 0.730938
## regionsouth -1.099e-01 4.296e-02 -2.559 0.010508 *
## security_typeIndriect NA NA NA NA
## dtir1 8.836e-03 1.012e-03 8.730 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 120238 on 107985 degrees of freedom
## Residual deviance: 74703 on 107939 degrees of freedom
## AIC: 74797
##
## Number of Fisher Scoring iterations: 25
# Predicted probabilities for the test set from the logistic model
prob_pred <- predict(
  logistics_classifier,
  newdata = test_set,
  type = "response"
)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
# Hard 0/1 labels at a 0.5 cut-off (not used by the table below)
y_pred <- ifelse(prob_pred > 0.5, 1, 0)
# Confusion table: actual status in rows, "probability above 0.5" in columns
cm <- table(
  ActualValue = test_set$status,
  PredictedValue = prob_pred > 0.5
)
cm
## PredictedValue
## ActualValue FALSE TRUE
## 0 26919 260
## 1 4520 4298
# Accuracy: share of observations on the diagonal of the table
sum(diag(cm)) / sum(cm)
## [1] 0.8672112
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following object is masked from 'package:imputeTS':
##
## na.locf
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
##
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
##
## boundary
##
## Attaching package: 'party'
## The following object is masked from 'package:dplyr':
##
## where
# Conditional inference tree for `status` using every other training column
Tree_Classifer <- ctree(status ~ ., data = train_set)
# Print the fitted tree structure
Tree_Classifer
##
## Conditional inference tree with 79 terminal nodes
##
## Response: status
## Inputs: loan_limit, gender, approv_in_adv, loan_type, loan_purpose, credit_worthiness, open_credit, business_or_commercial, loan_amount, rate_of_interest, interest_rate_spread, upfront_charges, term, neg_ammortization, interest_only, lump_sum_payment, property_value, construction_type, occupancy_type, secured_by, total_units, income, credit_type, credit_score, co.applicant_credit_type, age, submission_of_application, ltv, region, security_type, dtir1
## Number of observations: 107986
##
## 1) credit_type == {EQUI}; criterion = 1, statistic = 37684.315
## 2) rate_of_interest <= 4.088513; criterion = 1, statistic = 5491
## 3)* weights = 10873
## 2) rate_of_interest > 4.088513
## 4) dtir1 <= 37.65324; criterion = 1, statistic = 110
## 5)* weights = 7
## 4) dtir1 > 37.65324
## 6)* weights = 104
## 1) credit_type == {CIB, CRIF, EXP}
## 7) lump_sum_payment == {lpsm}; criterion = 1, statistic = 3177.545
## 8) submission_of_application == {not_inst}; criterion = 1, statistic = 57.953
## 9) upfront_charges <= 2762.44; criterion = 1, statistic = 72.462
## 10)* weights = 188
## 9) upfront_charges > 2762.44
## 11) upfront_charges <= 3550.096; criterion = 1, statistic = 112.678
## 12) rate_of_interest <= 3.75; criterion = 1, statistic = 80.85
## 13)* weights = 7
## 12) rate_of_interest > 3.75
## 14) rate_of_interest <= 4.085726; criterion = 1, statistic = 65.9
## 15) interest_rate_spread <= 0.4149882; criterion = 1, statistic = 60.867
## 16)* weights = 7
## 15) interest_rate_spread > 0.4149882
## 17)* weights = 258
## 14) rate_of_interest > 4.085726
## 18)* weights = 7
## 11) upfront_charges > 3550.096
## 19)* weights = 40
## 8) submission_of_application == {to_inst}
## 20) approv_in_adv == {pre}; criterion = 1, statistic = 59.865
## 21)* weights = 23
## 20) approv_in_adv == {nopre}
## 22) upfront_charges <= 3521.373; criterion = 1, statistic = 37.086
## 23) upfront_charges <= 2946.86; criterion = 1, statistic = 534.417
## 24)* weights = 126
## 23) upfront_charges > 2946.86
## 25) interest_rate_spread <= 0.47906; criterion = 1, statistic = 188.487
## 26) interest_rate_spread <= 0.400631; criterion = 1, statistic = 353.412
## 27)* weights = 10
## 26) interest_rate_spread > 0.400631
## 28) rate_of_interest <= 4.086297; criterion = 1, statistic = 364.512
## 29)* weights = 824
## 28) rate_of_interest > 4.086297
## 30)* weights = 10
## 25) interest_rate_spread > 0.47906
## 31)* weights = 23
## 22) upfront_charges > 3521.373
## 32)* weights = 146
## 7) lump_sum_payment == {not_lpsm}
## 33) neg_ammortization == {neg_amm}; criterion = 1, statistic = 1950.56
## 34) credit_worthiness == {l2}; criterion = 1, statistic = 1061.948
## 35) rate_of_interest <= 4.541252; criterion = 1, statistic = 496.984
## 36)* weights = 492
## 35) rate_of_interest > 4.541252
## 37)* weights = 26
## 34) credit_worthiness == {l1}
## 38) rate_of_interest <= 4.106765; criterion = 1, statistic = 762.709
## 39) rate_of_interest <= 4; criterion = 1, statistic = 1874.547
## 40) occupancy_type == {ir}; criterion = 1, statistic = 33.595
## 41)* weights = 28
## 40) occupancy_type == {pr, sr}
## 42) rate_of_interest <= 3.99; criterion = 0.993, statistic = 13.716
## 43) age == {<25, 25-34, 45-54}; criterion = 1, statistic = 39.722
## 44) age == {<25}; criterion = 0.951, statistic = 12.852
## 45)* weights = 12
## 44) age == {25-34, 45-54}
## 46)* weights = 684
## 43) age == {>74, 35-44, 55-64, 65-74}
## 47)* weights = 1113
## 42) rate_of_interest > 3.99
## 48) rate_of_interest <= 3.998689; criterion = 1, statistic = 52.107
## 49)* weights = 9
## 48) rate_of_interest > 3.998689
## 50)* weights = 62
## 39) rate_of_interest > 4
## 51) interest_rate_spread <= 0.4111201; criterion = 1, statistic = 887.192
## 52) interest_rate_spread <= 0.3693723; criterion = 1, statistic = 21.589
## 53)* weights = 7
## 52) interest_rate_spread > 0.3693723
## 54)* weights = 17
## 51) interest_rate_spread > 0.4111201
## 55)* weights = 2270
## 38) rate_of_interest > 4.106765
## 56) rate_of_interest <= 4.241421; criterion = 1, statistic = 19.05
## 57) rate_of_interest <= 4.18; criterion = 1, statistic = 94.922
## 58) loan_purpose == {p2, p4}; criterion = 1, statistic = 49.019
## 59)* weights = 74
## 58) loan_purpose == {p1, p3}
## 60)* weights = 205
## 57) rate_of_interest > 4.18
## 61)* weights = 8
## 56) rate_of_interest > 4.241421
## 62) business_or_commercial == {b/c}; criterion = 0.998, statistic = 15.896
## 63) interest_rate_spread <= 0.4712; criterion = 1, statistic = 26.003
## 64)* weights = 7
## 63) interest_rate_spread > 0.4712
## 65)* weights = 468
## 62) business_or_commercial == {nob/c}
## 66)* weights = 3659
## 33) neg_ammortization == {not_neg}
## 67) submission_of_application == {to_inst}; criterion = 1, statistic = 1177.585
## 68) loan_type == {type2}; criterion = 1, statistic = 684.472
## 69) interest_rate_spread <= 0.4789281; criterion = 1, statistic = 3261.762
## 70) rate_of_interest <= 3.99; criterion = 1, statistic = 2150.126
## 71) rate_of_interest <= 3.875; criterion = 1, statistic = 71.159
## 72)* weights = 446
## 71) rate_of_interest > 3.875
## 73) rate_of_interest <= 3.989792; criterion = 0.994, statistic = 13.857
## 74)* weights = 15
## 73) rate_of_interest > 3.989792
## 75)* weights = 15
## 70) rate_of_interest > 3.99
## 76) upfront_charges <= 2695.539; criterion = 1, statistic = 1480.899
## 77) interest_rate_spread <= 0.3485905; criterion = 0.998, statistic = 16.438
## 78)* weights = 12
## 77) interest_rate_spread > 0.3485905
## 79)* weights = 12
## 76) upfront_charges > 2695.539
## 80)* weights = 2201
## 69) interest_rate_spread > 0.4789281
## 81)* weights = 5834
## 68) loan_type == {type1, type3}
## 82) upfront_charges <= 3484.874; criterion = 1, statistic = 841.207
## 83) upfront_charges <= 3000.37; criterion = 1, statistic = 6023.67
## 84) upfront_charges <= 2944; criterion = 1, statistic = 186.606
## 85) upfront_charges <= 1182.32; criterion = 0.996, statistic = 14.568
## 86)* weights = 5056
## 85) upfront_charges > 1182.32
## 87) loan_amount <= 466500; criterion = 1, statistic = 21.828
## 88) income <= 18840; criterion = 0.973, statistic = 12.336
## 89)* weights = 7459
## 88) income > 18840
## 90)* weights = 77
## 87) loan_amount > 466500
## 91)* weights = 414
## 84) upfront_charges > 2944
## 92) ltv <= 83.29918; criterion = 1, statistic = 34.69
## 93) loan_amount <= 236500; criterion = 1, statistic = 23.359
## 94) interest_rate_spread <= 0.4481444; criterion = 0.989, statistic = 12.778
## 95) interest_rate_spread <= 0.4268; criterion = 0.992, statistic = 13.303
## 96)* weights = 59
## 95) interest_rate_spread > 0.4268
## 97)* weights = 25
## 94) interest_rate_spread > 0.4481444
## 98)* weights = 198
## 93) loan_amount > 236500
## 99)* weights = 114
## 92) ltv > 83.29918
## 100)* weights = 103
## 83) upfront_charges > 3000.37
## 101) interest_rate_spread <= 0.4788959; criterion = 1, statistic = 956.05
## 102) interest_rate_spread <= 0.4065; criterion = 1, statistic = 3512.229
## 103) occupancy_type == {ir, sr}; criterion = 1, statistic = 108.952
## 104)* weights = 44
## 103) occupancy_type == {pr}
## 105) interest_rate_spread <= 0.3909; criterion = 0.958, statistic = 148.516
## 106)* weights = 1426
## 105) interest_rate_spread > 0.3909
## 107)* weights = 69
## 102) interest_rate_spread > 0.4065
## 108) rate_of_interest <= 3.99; criterion = 1, statistic = 682.975
## 109) interest_rate_spread <= 0.4100713; criterion = 1, statistic = 32.033
## 110)* weights = 19
## 109) interest_rate_spread > 0.4100713
## 111)* weights = 144
## 108) rate_of_interest > 3.99
## 112) rate_of_interest <= 4.096569; criterion = 1, statistic = 4946.938
## 113)* weights = 6507
## 112) rate_of_interest > 4.096569
## 114)* weights = 126
## 101) interest_rate_spread > 0.4788959
## 115)* weights = 2930
## 82) upfront_charges > 3484.874
## 116) upfront_charges <= 3548.442; criterion = 1, statistic = 173.011
## 117) loan_amount <= 236500; criterion = 0.992, statistic = 13.273
## 118)* weights = 338
## 117) loan_amount > 236500
## 119)* weights = 148
## 116) upfront_charges > 3548.442
## 120)* weights = 20473
## 67) submission_of_application == {not_inst}
## 121) interest_rate_spread <= 0.4089; criterion = 1, statistic = 609.037
## 122) business_or_commercial == {nob/c}; criterion = 1, statistic = 53.747
## 123) interest_rate_spread <= 0.4002; criterion = 1, statistic = 36.016
## 124) interest_rate_spread <= 0.3275; criterion = 1, statistic = 22.918
## 125) loan_purpose == {p1, p4}; criterion = 0.999, statistic = 22.866
## 126)* weights = 15022
## 125) loan_purpose == {p2, p3}
## 127) occupancy_type == {ir}; criterion = 0.996, statistic = 17.991
## 128)* weights = 61
## 127) occupancy_type == {pr, sr}
## 129)* weights = 3528
## 124) interest_rate_spread > 0.3275
## 130) dtir1 <= 50; criterion = 0.981, statistic = 11.712
## 131)* weights = 1575
## 130) dtir1 > 50
## 132)* weights = 17
## 123) interest_rate_spread > 0.4002
## 133)* weights = 206
## 122) business_or_commercial == {b/c}
## 134) rate_of_interest <= 3.99; criterion = 1, statistic = 63.091
## 135) rate_of_interest <= 3.875; criterion = 0.994, statistic = 13.843
## 136)* weights = 548
## 135) rate_of_interest > 3.875
## 137)* weights = 11
## 134) rate_of_interest > 3.99
## 138)* weights = 7
## 121) interest_rate_spread > 0.4089
## 139) interest_rate_spread <= 0.4696689; criterion = 1, statistic = 2284.793
## 140) upfront_charges <= 2925.97; criterion = 1, statistic = 465.181
## 141)* weights = 941
## 140) upfront_charges > 2925.97
## 142) upfront_charges <= 3546.892; criterion = 1, statistic = 1264.361
## 143) rate_of_interest <= 3.99; criterion = 0.997, statistic = 15.249
## 144) interest_rate_spread <= 0.4138774; criterion = 0.998, statistic = 16.29
## 145)* weights = 9
## 144) interest_rate_spread > 0.4138774
## 146)* weights = 23
## 143) rate_of_interest > 3.99
## 147) rate_of_interest <= 4.090069; criterion = 1, statistic = 1206.023
## 148) rate_of_interest <= 4.004944; criterion = 0.991, statistic = 13.22
## 149)* weights = 27
## 148) rate_of_interest > 4.004944
## 150)* weights = 2520
## 147) rate_of_interest > 4.090069
## 151) interest_rate_spread <= 0.4514; criterion = 0.996, statistic = 14.883
## 152)* weights = 14
## 151) interest_rate_spread > 0.4514
## 153)* weights = 13
## 142) upfront_charges > 3546.892
## 154)* weights = 246
## 139) interest_rate_spread > 0.4696689
## 155) interest_rate_spread <= 0.4789182; criterion = 1, statistic = 39.409
## 156)* weights = 183
## 155) interest_rate_spread > 0.4789182
## 157)* weights = 6977
# Draw the fitted tree
plot(Tree_Classifer)
# Predict the test set with the tree, then tabulate actual vs. predicted
pred <- predict(Tree_Classifer, newdata = test_set)
cm <- table(
  ActualValue = test_set$status,
  PredictedValue = pred
)
cm
## PredictedValue
## ActualValue 0 1
## 0 27167 12
## 1 94 8724
# Accuracy: share of observations on the diagonal of the table
sum(diag(cm)) / sum(cm)
## [1] 0.9970553
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# Random forest for `status` on every other training column. The seed is set
# immediately before the fit so the forest is reproducible.
# NOTE(review): the recorded importance table is dominated by
# rate_of_interest, interest_rate_spread and upfront_charges, and test accuracy
# is ~0.999 -- worth checking these columns for target leakage.
set.seed(222)
rf_classifier <- randomForest(status ~ ., data = train_set)
# Dump the internal structure of the fitted forest object
str(rf_classifier)
## List of 19
## $ call : language randomForest(formula = status ~ ., data = train_set)
## $ type : chr "classification"
## $ predicted : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "names")= chr [1:107986] "3" "4" "5" "6" ...
## $ err.rate : num [1:500, 1:3] 0.00737 0.00705 0.00584 0.00528 0.0048 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : NULL
## .. ..$ : chr [1:3] "OOB" "0" "1"
## $ confusion : num [1:2, 1:3] 81534 99 0 26353 0 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "0" "1"
## .. ..$ : chr [1:3] "0" "1" "class.error"
## $ votes : 'matrix' num [1:107986, 1:2] 1 1 1 1 1 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:107986] "3" "4" "5" "6" ...
## .. ..$ : chr [1:2] "0" "1"
## $ oob.times : num [1:107986] 193 185 187 198 206 180 181 187 178 178 ...
## $ classes : chr [1:2] "0" "1"
## $ importance : num [1:31, 1] 19.6 38.9 18.3 214.1 94.9 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:31] "loan_limit" "gender" "approv_in_adv" "loan_type" ...
## .. ..$ : chr "MeanDecreaseGini"
## $ importanceSD : NULL
## $ localImportance: NULL
## $ proximity : NULL
## $ ntree : num 500
## $ mtry : num 5
## $ forest :List of 14
## ..$ ndbigtree : int [1:500] 1133 1169 799 825 1399 1369 1307 841 1045 691 ...
## ..$ nodestatus: int [1:2761, 1:500] 1 1 1 1 1 1 1 -1 1 1 ...
## ..$ bestvar : int [1:2761, 1:500] 11 22 11 12 11 23 11 0 29 23 ...
## ..$ treemap : int [1:2761, 1:2, 1:500] 2 4 6 8 10 12 14 0 16 18 ...
## ..$ nodepred : int [1:2761, 1:500] 0 0 0 0 0 0 0 2 0 0 ...
## ..$ xbestsplit: num [1:2761, 1:500] 0.408 330 0.47 3218.194 0.406 ...
## ..$ pid : num [1:2] 1 1
## ..$ cutoff : num [1:2] 0.5 0.5
## ..$ ncat : Named int [1:31] 2 4 2 3 4 2 2 2 1 1 ...
## .. ..- attr(*, "names")= chr [1:31] "loan_limit" "gender" "approv_in_adv" "loan_type" ...
## ..$ maxcat : int 7
## ..$ nrnodes : int 2761
## ..$ ntree : num 500
## ..$ nclass : int 2
## ..$ xlevels :List of 31
## .. ..$ loan_limit : chr [1:2] "cf" "ncf"
## .. ..$ gender : chr [1:4] "Female" "Joint" "Male" "Sex Not Available"
## .. ..$ approv_in_adv : chr [1:2] "nopre" "pre"
## .. ..$ loan_type : chr [1:3] "type1" "type2" "type3"
## .. ..$ loan_purpose : chr [1:4] "p1" "p2" "p3" "p4"
## .. ..$ credit_worthiness : chr [1:2] "l1" "l2"
## .. ..$ open_credit : chr [1:2] "nopc" "opc"
## .. ..$ business_or_commercial : chr [1:2] "b/c" "nob/c"
## .. ..$ loan_amount : num 0
## .. ..$ rate_of_interest : num 0
## .. ..$ interest_rate_spread : num 0
## .. ..$ upfront_charges : num 0
## .. ..$ term : num 0
## .. ..$ neg_ammortization : chr [1:2] "neg_amm" "not_neg"
## .. ..$ interest_only : chr [1:2] "int_only" "not_int"
## .. ..$ lump_sum_payment : chr [1:2] "lpsm" "not_lpsm"
## .. ..$ property_value : num 0
## .. ..$ construction_type : chr [1:2] "mh" "sb"
## .. ..$ occupancy_type : chr [1:3] "ir" "pr" "sr"
## .. ..$ secured_by : chr [1:2] "home" "land"
## .. ..$ total_units : chr [1:4] "1U" "2U" "3U" "4U"
## .. ..$ income : num 0
## .. ..$ credit_type : chr [1:4] "CIB" "CRIF" "EQUI" "EXP"
## .. ..$ credit_score : num 0
## .. ..$ co.applicant_credit_type : chr [1:2] "CIB" "EXP"
## .. ..$ age : chr [1:7] "<25" ">74" "25-34" "35-44" ...
## .. ..$ submission_of_application: chr [1:2] "not_inst" "to_inst"
## .. ..$ ltv : num 0
## .. ..$ region : chr [1:4] "central" "North" "North-East" "south"
## .. ..$ security_type : chr [1:2] "direct" "Indriect"
## .. ..$ dtir1 : num 0
## $ y : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "names")= chr [1:107986] "3" "4" "5" "6" ...
## $ test : NULL
## $ inbag : NULL
## $ terms :Classes 'terms', 'formula' language status ~ loan_limit + gender + approv_in_adv + loan_type + loan_purpose + credit_worthiness + open_credit + | __truncated__ ...
## .. ..- attr(*, "variables")= language list(status, loan_limit, gender, approv_in_adv, loan_type, loan_purpose, credit_worthiness, open_credit, bus| __truncated__ ...
## .. ..- attr(*, "factors")= int [1:32, 1:31] 0 1 0 0 0 0 0 0 0 0 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:32] "status" "loan_limit" "gender" "approv_in_adv" ...
## .. .. .. ..$ : chr [1:31] "loan_limit" "gender" "approv_in_adv" "loan_type" ...
## .. ..- attr(*, "term.labels")= chr [1:31] "loan_limit" "gender" "approv_in_adv" "loan_type" ...
## .. ..- attr(*, "order")= int [1:31] 1 1 1 1 1 1 1 1 1 1 ...
## .. ..- attr(*, "intercept")= num 0
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(status, loan_limit, gender, approv_in_adv, loan_type, loan_purpose, credit_worthiness, open_credit, bus| __truncated__ ...
## .. ..- attr(*, "dataClasses")= Named chr [1:32] "factor" "factor" "factor" "factor" ...
## .. .. ..- attr(*, "names")= chr [1:32] "status" "loan_limit" "gender" "approv_in_adv" ...
## - attr(*, "class")= chr [1:2] "randomForest.formula" "randomForest"
# List the component names and class of the fitted forest
attributes(rf_classifier)
## $names
## [1] "call" "type" "predicted" "err.rate"
## [5] "confusion" "votes" "oob.times" "classes"
## [9] "importance" "importanceSD" "localImportance" "proximity"
## [13] "ntree" "mtry" "forest" "y"
## [17] "test" "inbag" "terms"
##
## $class
## [1] "randomForest.formula" "randomForest"
# Class predictions for the test set
rf_pred <- predict(rf_classifier, newdata = test_set)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Confusion matrix and summary statistics for the random-forest predictions.
# NOTE(review): the recorded output reports 'Positive' Class : 0, so the
# sensitivity/specificity figures describe class "0" -- confirm that is the
# class of interest, or pass `positive = "1"`.
confusionMatrix(data = rf_pred, reference = test_set$status)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 27179 32
## 1 0 8786
##
## Accuracy : 0.9991
## 95% CI : (0.9987, 0.9994)
## No Information Rate : 0.755
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9976
##
## Mcnemar's Test P-Value : 4.251e-08
##
## Sensitivity : 1.0000
## Specificity : 0.9964
## Pos Pred Value : 0.9988
## Neg Pred Value : 1.0000
## Prevalence : 0.7550
## Detection Rate : 0.7550
## Detection Prevalence : 0.7559
## Balanced Accuracy : 0.9982
##
## 'Positive' Class : 0
##
# Error rate of the forest as trees are added
plot(x = rf_classifier)
# Variable-importance dot chart
varImpPlot(x = rf_classifier)
# Variable-importance values as a table (MeanDecreaseGini)
importance(x = rf_classifier)
## MeanDecreaseGini
## loan_limit 1.960654e+01
## gender 3.890623e+01
## approv_in_adv 1.834797e+01
## loan_type 2.141204e+02
## loan_purpose 9.492187e+01
## credit_worthiness 2.690797e+01
## open_credit 1.084549e+01
## business_or_commercial 1.325256e+02
## loan_amount 1.752984e+02
## rate_of_interest 1.306849e+04
## interest_rate_spread 1.001180e+04
## upfront_charges 7.267285e+03
## term 8.797034e+01
## neg_ammortization 1.224602e+02
## interest_only 8.573198e+00
## lump_sum_payment 3.127223e+02
## property_value 7.396553e+02
## construction_type 1.740026e-01
## occupancy_type 1.863543e+01
## secured_by 1.349490e-01
## total_units 4.530986e+00
## income 2.824626e+02
## credit_type 5.423771e+03
## credit_score 5.258760e+01
## co.applicant_credit_type 1.425788e+02
## age 4.127812e+01
## submission_of_application 1.062712e+02
## ltv 7.201919e+02
## region 1.598862e+01
## security_type 1.851919e-01
## dtir1 7.657246e+02